This document tabulates the REDD dataset for load classification.
The window considered is 5 minutes (300 seconds).
Experimental procedures based on http://www.sbrt.org.br/sbrt2017/anais/1570359866.pdf
In [1]:
import warnings
# warnings.filterwarnings("ignore")
import traceback
import time
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from matplotlib import rcParams
rcParams['figure.figsize'] = (13, 10)
import pandas as pd
from tqdm import tqdm, tqdm_notebook
#tf.debugging.set_log_device_placement(True)
#print("GPU Available: ", tf.test.is_gpu_available())
In [ ]:
from nilmtk import DataSet
from nilmtk.utils import print_dict
redd = DataSet('datasets/REDD/low_freq.h5')
In [ ]:
# Sampling configuration
from datetime import datetime, timedelta
# Parameters from the CS446 Project: Electric Load Identification using Machine Learning (REDD)
building_idx = 3
set_sampling_rate = 3
start = datetime(2011, 4, 16, 5, 11, 27)
end = datetime(2011, 5, 31, 0, 19, 54)
time_interval_minutes = 5  # Chunk size (minutes)
# ... 6 seconds - Imaging Time Series (UK-DALE)
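As a quick sanity check of the window arithmetic: a 5-minute chunk sampled every 3 seconds should contain 100 readings.
In [ ]:
# Sanity check: expected number of readings per 5-minute chunk
samples_per_chunk = (time_interval_minutes * 60) // set_sampling_rate
print('Samples per chunk:', samples_per_chunk)  # 300 s / 3 s = 100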
In [ ]:
building_idx = 1
building = redd.buildings[building_idx]
In [ ]:
# Available devices in building
building.elec
In [ ]:
# Defining the fixed time block measurement
redd.set_window(start='2011-04-18', end='2011-04-20')
In [ ]:
# Showing device consumption (inside time block)
num_apps = 20
fig, axes = plt.subplots((num_apps+1)//2, 2, figsize=(24, num_apps*2))
for i in range(num_apps):
    e = redd.buildings[1].elec[i+1]
    axes.flat[i].plot(e.power_series_all_data(sample_period=3), alpha=0.6)
    axes.flat[i].set_title(e.label(), fontsize='15')
plt.suptitle('', fontsize='30')
fig.tight_layout()
fig.subplots_adjust(top=0.95)
In [ ]:
# Data generation intervals
def datetime_range(start, end, delta):
    '''
    Generate datetime intervals (chunks of energy consumption)
    from `start` to `end`, stepping by `delta`.
    '''
    current = start
    while current < end:
        yield current
        current += delta

# List of datetimes
dts = [dt.strftime('%Y-%m-%d %H:%M:%S')
       for dt in datetime_range(start,
                                end,
                                timedelta(minutes=time_interval_minutes))]
In [ ]:
# Checking the chunk list...
for idx in range(1, len(dts)):
    print('from', dts[idx-1], ' to ', dts[idx])
In [ ]:
power = building.elec[1].power_series_all_data(sample_period=set_sampling_rate)
mains1 = pd.DataFrame(data={"Power": power.values}, index=power.index)
print('Mains 1 original shape: ', mains1.shape)
power = building.elec[2].power_series_all_data(sample_period=set_sampling_rate)
mains2 = pd.DataFrame(data={"Power": power.values}, index=power.index)
print('Mains 2 original shape: ', mains2.shape)
power = building.elec[5].power_series_all_data(sample_period=set_sampling_rate)
appliance = pd.DataFrame(data={"Power": power.values}, index=power.index)
print('Appliance shape: ', appliance.shape)
In [ ]:
len(appliance.index)
In [ ]:
# Collection of dataframes (5-minute chunks)
dataframes = []
# Iterating over the 5-minute time blocks
for idx in tqdm_notebook(range(1, len(dts))):
    tStart = dts[idx-1]
    tEnd = dts[idx]
    # Restrict the dataset window to the current chunk
    redd.set_window(start=tStart, end=tEnd)
    try:
        print('- Chunk #', idx, ': from ', tStart, 'to', tEnd)
        dfs = {}
        _index = []
        for m in building.elec.all_meters():
            label = str(m.label()).lower().replace(' ', '_') + '_' + str(m.instance())
            power = m.power_series_all_data(sample_period=set_sampling_rate)
            dfs[label] = pd.DataFrame(data={"Power": power.values}, index=power.index)
            # Keep the longest appliance (non-site-meter) index as the chunk's reference index
            if len(_index) < len(power.index) and ('site_meter' not in label):
                _index = power.index
        # Align every meter to the reference index (appliances are sampled at 3 s vs. 1 s for the mains)
        for meter_label in dfs:
            dfs[meter_label] = dfs[meter_label].reindex(index=_index)
            dfs[meter_label] = dfs[meter_label]['Power'].values
        df = pd.DataFrame(dfs, index=_index)
        print('\n---\nDataframe shape: ', df.shape, '\n')
        dataframes.append(df)
    except Exception as e:
        # Data could not be extracted for this interval
        print(' ----- Error: ', str(e))
In [ ]:
print('Total Chunks:', len(dataframes))
In [ ]:
# Keep only chunks with the expected length
valid_chunk_length = (time_interval_minutes*60)/set_sampling_rate
valid_chunks = [d for d in dataframes if d.shape[0] == valid_chunk_length]
print('Valid Chunks:', len(valid_chunks))
In [ ]:
# Plotting the first chunk
for df in tqdm(dataframes):
    fig = plt.figure(figsize=(20, 10))
    for column in df.columns:
        plt.plot(df[column].values)
    plt.gca().legend(df.columns)
    break
In [ ]:
# Statistical features per chunk from the two site meters, plus binary labels per appliance
final_df = pd.DataFrame()
classes = [c for c in dataframes[0].columns if 'site_meter' not in c]
for df in dataframes:
    attributes = {
        'mean_1': df['site_meter_1'].mean(),
        'std_1': df['site_meter_1'].std(),
        'max_1': df['site_meter_1'].max(),
        'min_1': df['site_meter_1'].min(),
        'sum_1': df['site_meter_1'].sum(),
        'mean_2': df['site_meter_2'].mean(),
        'std_2': df['site_meter_2'].std(),
        'max_2': df['site_meter_2'].max(),
        'min_2': df['site_meter_2'].min(),
        'sum_2': df['site_meter_2'].sum()
    }
    # An appliance is labeled active (1) if it drew any power within the chunk
    labels = {}
    for c in classes:
        labels[c] = 1 if df[c].sum() > 0 else 0
    final_df = final_df.append({**attributes, **labels}, ignore_index=True)
final_df = final_df[list(attributes.keys()) + list(labels.keys())]
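Note that `DataFrame.append` was removed in pandas 2.0. On newer pandas, an equivalent (a minimal sketch, assuming the same `dataframes` and `classes` as above) collects row dicts and builds the frame once:
In [ ]:
# pandas >= 2.0 variant (a sketch): collect row dicts, then build the frame once
rows = []
for df in dataframes:
    # Same mean/std/max/min/sum features per site meter as above
    stats = {f'{fn}_{i}': getattr(df[f'site_meter_{i}'], fn)()
             for i in (1, 2) for fn in ('mean', 'std', 'max', 'min', 'sum')}
    labels = {c: int(df[c].sum() > 0) for c in classes}
    rows.append({**stats, **labels})
final_df = pd.DataFrame(rows)[list(stats.keys()) + list(labels.keys())]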
In [ ]:
final_df.head(10)
In [ ]:
final_df.describe()
In [ ]:
final_df.to_csv('df_building_1_statistics_features.csv')
In [ ]:
# TODO:
# - Validate the methodology
# - Validate the generated chunks (normalize errors)
# - Label the dataset (binary labels per device)
In [ ]:
final_df[final_df.columns[:10]].head()
In [ ]:
In [ ]:
from skmultilearn.adapt import MLkNN
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
scores = cross_val_score(
    MLkNN(k=3),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring='f1_micro',
    cv=5,
    n_jobs=8
)
scores.mean()
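`GridSearchCV` is imported above but not used; a minimal sketch of using it to tune MLkNN's `k` (the grid values are illustrative, not from the original notebook):
In [ ]:
# Tuning MLkNN's k via grid search (a sketch; grid values are illustrative)
grid = GridSearchCV(MLkNN(),
                    param_grid={'k': [1, 3, 5, 10]},
                    scoring='f1_micro',
                    cv=5,
                    n_jobs=8)
grid.fit(final_df[final_df.columns[:5]].values,
         final_df[final_df.columns[10:]].values)
print(grid.best_params_, grid.best_score_)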
In [ ]:
from sklearn.metrics import make_scorer, hamming_loss
# Note: hamming_loss is a loss (lower is better). Wrapping it without
# greater_is_better=False is acceptable here because cross_val_score only
# reports the value; it does not optimize over it.
hamming_score = make_scorer(hamming_loss)
scores = cross_val_score(
    MLkNN(k=3),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring=hamming_score,
    cv=5,
    n_jobs=8
)
scores.mean()
In [ ]:
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
scores = cross_val_score(
    RandomForestClassifier(n_estimators=1000),
    final_df[final_df.columns[:5]].values,
    final_df[final_df.columns[10:]].values,
    scoring=hamming_score,
    cv=5,
    n_jobs=8
)
scores.mean()
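`train_test_split` is imported but not used; a minimal sketch of a held-out, per-label report with it (the 25% test fraction and the seed are illustrative assumptions):
In [ ]:
# Held-out per-label evaluation (a sketch; split parameters are illustrative)
from sklearn.metrics import classification_report
X = final_df[final_df.columns[:5]].values
Y = final_df[final_df.columns[10:]].values
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.25, random_state=42)
rf = RandomForestClassifier(n_estimators=1000).fit(X_tr, Y_tr)
print(classification_report(Y_te, rf.predict(X_te),
                            target_names=list(final_df.columns[10:])))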
In [ ]:
In [ ]:
# dates = [str(dt).split(' ')[0] for dt in df.index]
dates = [str(time)[:10] for time in df.index.values]
dates = sorted(list(set(dates)))
print('The modeled household data contains measurements for {0} day(s) (from {1} to {2}).'.format(len(dates), dates[0], dates[-1]))
In [ ]:
# Train, validation and test split
df1_train = df.loc[:dates[10]]
df1_val = df.loc[dates[11]:dates[16]]
df1_test = df.loc[dates[17]:]
print('df_train.shape: ', df1_train.shape)
print('df_val.shape: ', df1_val.shape)
print('df_test.shape: ', df1_test.shape)
In [ ]:
# Using mains 1 and 2 (independent variables) to predict the refrigerator (dependent variable)
X_train = df1_train[['Mains1','Mains2']].values
y_train = df1_train['Appliance'].values
X_test = df1_test[['Mains1','Mains2']].values
y_test = df1_test['Appliance'].values
X_val = df1_val[['Mains1','Mains2']].values
y_val = df1_val['Appliance'].values
print(
    'Train: ', X_train.shape, y_train.shape, '\n',
    'Validation: ', X_val.shape, y_val.shape, '\n',
    'Test: ', X_test.shape, y_test.shape
)
In [ ]:
# Regression evaluation metrics
import numpy as np
def mse_loss(y_predict, y):
    return np.mean(np.square(y_predict - y))
def mae_loss(y_predict, y):
    return np.mean(np.abs(y_predict - y))
In [ ]:
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model
def build_fc_model(layers):
    fc_model = Sequential()
    for i in range(len(layers)-1):
        fc_model.add(Dense(input_shape=(layers[i],), units=layers[i+1]))  # , kernel_regularizer=l2(0.1)
        fc_model.add(Dropout(0.5))
        if i < (len(layers) - 2):
            fc_model.add(Activation('relu'))
    fc_model.build()
    fc_model.summary()
    plot_model(fc_model)
    return fc_model

fc_model_1 = build_fc_model([2, 256, 512, 1024, 1])
In [ ]:
adam = Adam(learning_rate=1e-5)
fc_model_1.compile(loss='mean_squared_error', optimizer=adam)
start_time = time.time()  # avoid clobbering the `start` datetime defined earlier
model_path = "./resources/mlp_fridge_h1.hdf5"
checkpointer = ModelCheckpoint(filepath=model_path, verbose=0, save_best_only=True)
hist_fc_1 = fc_model_1.fit(X_train, y_train,
                           batch_size=512, verbose=1, epochs=200,
                           validation_split=0.33, callbacks=[checkpointer])
print('Total model training time (s):', round(time.time() - start_time, 0))
In [ ]:
fc_model = load_model(model_path)
pred_fc = fc_model.predict(X_test).reshape(-1)
mse_loss_fc = mse_loss(pred_fc, y_test)
mae_loss_fc = mae_loss(pred_fc, y_test)
print('MSE on the test set: ', mse_loss_fc)
print('MAE on the test set:', mae_loss_fc)
In [ ]:
train_loss = hist_fc_1.history['loss']
val_loss = hist_fc_1.history['val_loss']
def plot_losses(train_loss, val_loss):
    plt.rcParams["figure.figsize"] = [24, 10]
    plt.title('Training and validation MSE - House 1')
    plt.plot(range(len(train_loss)), train_loss, color='b', alpha=0.6, label='loss (train)')
    plt.plot(range(len(val_loss)), val_loss, color='r', alpha=0.6, label='loss (validation)')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
plot_losses(train_loss, val_loss)
In [ ]:
# Plotting the ACTUAL and PREDICTED refrigerator consumption over the 6 days of test data
def plot_each_app(df, dates, predict, y_test, title, look_back=0):
    num_date = len(dates)
    fig, axes = plt.subplots(num_date, 1, figsize=(24, num_date*5))
    plt.suptitle(title, fontsize='25')
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    for i in range(num_date):
        if i == 0: l = 0
        ind = df.loc[dates[i]].index[look_back:]  # .ix was removed from pandas; .loc replaces it
        axes.flat[i].plot(ind, y_test[l:l+len(ind)], color='blue', alpha=0.6, label='ACTUAL')
        axes.flat[i].plot(ind, predict[l:l+len(ind)], color='red', alpha=0.6, label='PREDICTED')
        axes.flat[i].legend()
        l += len(ind)  # accumulate the offset across days
plot_each_app(df1_test, dates[17:], pred_fc, y_test,
              'FC Neural Network: Actual vs. Predicted over the 6 test days of House 1', look_back=50)
In [ ]:
# TODO: try a more complex FC architecture
In [ ]:
def build_lstm_model(layers):
    model = Sequential()
    for i in range(len(layers) - 2):
        if i == 0:
            model.add(Embedding(input_dim=layers[i], output_dim=layers[i+1]))
        else:
            model.add(LSTM(
                input_shape=(layers[i],),
                units=layers[i+1],
                return_sequences=True if i < len(layers) - 3 else False))
            model.add(Dropout(0.3))
    model.add(Dense(layers[-1]))
    model.build()
    model.summary()
    plot_model(model)
    return model

model = build_lstm_model([2, 64, 128, 256, 1])
In [ ]:
# Using a look-back of 50 consumption records to predict each appliance's consumption
def process_data(df, dates, x_features, y_features, look_back=50):
    i = 0
    for date in dates:
        data = df.loc[date]
        len_data = data.shape[0]
        # Sliding windows of `look_back` mains readings...
        x = np.array([data[x_features].values[j:j+look_back]
                      for j in range(len_data - look_back)]).reshape(-1, look_back, 2)
        # ...paired with the appliance readings immediately after each window
        y = data[y_features].values[look_back:, :]
        if i == 0:
            X = x
            Y = y
        else:
            X = np.append(X, x, axis=0)
            Y = np.append(Y, y, axis=0)
        i += 1
    return X, Y

start_time = time.time()
X_train, y_train = process_data(df, dates[:17], ['Mains1','Mains2'], df.columns.values[2:])
X_test, y_test = process_data(df, dates[17:], ['Mains1','Mains2'], df.columns.values[2:])
print('Total execution time (s): ', time.time() - start_time)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
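A toy check of the windowing logic (the synthetic frame below is illustrative, not REDD data): with 8 rows and `look_back=3`, `process_data` should return 5 windows of 3 mains readings each.
In [ ]:
# Toy verification of the sliding-window construction (illustrative data only)
toy = pd.DataFrame({'Mains1': range(8), 'Mains2': range(8), 'Appliance': range(8)},
                   index=pd.date_range('2011-04-18', periods=8, freq='3S'))
tx, ty = process_data(toy, ['2011-04-18'], ['Mains1','Mains2'],
                      toy.columns.values[2:], look_back=3)
print(tx.shape, ty.shape)  # expected: (5, 3, 2) (5, 1)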
In [ ]:
start_time = time.time()
adam = Adam(learning_rate=5e-5)
lstm_model_path = "./resources/lstm_model.hdf5"
model.compile(loss='mean_squared_error', optimizer=adam)
checkpointer = ModelCheckpoint(filepath=lstm_model_path, verbose=0, save_best_only=True)
hist_lstm = model.fit(
    X_train,
    y_train[:, 2],
    batch_size=512,
    verbose=1,
    epochs=200,
    validation_split=0.3,
    callbacks=[checkpointer])
print('Training time (s): ', time.time() - start_time)
In [ ]:
y_train
In [ ]: